library(limma)
library(DESeq2)
library(biomaRt)
library(org.Hs.eg.db)
library(readxl)
library(reshape2)
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(caret)
library(steFunctions)
library(EnsDb.Hsapiens.v79)
library(EnsDb.Hsapiens.v75)
library(heatmap3)
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(doMC)
library(pROC)
#define function to get color gradient
getGradientColors <- function(colValue, minRange = NULL, maxRange = NULL, ccolors = c('green3', 'red'), bbreak = 0.01){
library(heatmap3)
if(is.null(minRange)){
minRange <- min(colValue, na.rm = T) - 0.001
}
if(is.null(maxRange)){
maxRange <- max(colValue, na.rm = T) + 0.001
}
bre <- seq(minRange, maxRange, bbreak)
colori <- colByValue(colValue, col = colorRampPalette(ccolors)(length(bre)-1), breaks= bre, cex.axis = 0.8)
colori <- as.vector(colori)
graphics.off()
names(colori) <- names(colValue)
return(colori)
}
MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
wdd="/home/mclaudia/MEDIA Project/OAStratification-pipeline/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"
load(paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"),verbose=TRUE) #for TPM duplicate fix
## Loading objects:
## OAvsnonOA
up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)
ensdf <- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type="DataFrame")
ens2gene <- as.data.frame(ensdf[,c("gene_id","gene_name")])
ensdf19 <- GenomicFeatures::genes(EnsDb.Hsapiens.v75, return.type="DataFrame")
ens2gene19 <- as.data.frame(ensdf19[,c("gene_id","gene_name")])
load(paste0(wdd,"data/txi.RData"))
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")
tmp <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))] #collect TPM matrix
keep <- rowSums(tmp > 0) >= 35 #TPMs > 0 in at least 50% of the samples
tmp <-tmp[keep,]
tmp <-cbind(gene_id=rownames(tmp), tmp)
tmp<-merge(tmp,ens2gene,by.y="gene_id")
tmp <-cbind(tmp$gene_name, tmp[,-72])
colnames(tmp)[1] <-"gene_name"
rownames(tmp) <-tmp$gene_id
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp[,1])
tpm.mat_maintain <-tmp[!tmp$gene_name %in% gn_dup,]
tpm2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tpm.mat_maintain[,-c(1,2)])))
colnames(tpm2) <-colnames(tpm.mat_maintain)[-c(1,2)]
rownames(tpm2) <-paste0("ENS_",1:length(gn_dup))
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp[,1] %in% x,-c(1,2)]
tpm2[i,1:ncol(tpm2)] <-apply(y,2,function(x) round(mean(as.numeric(x)),3))
}
tpm2 <-cbind(gene_name=gn_dup,tpm2)
tpm2 <-tpm2[,colnames(tpm.mat_maintain)[-2]]
tpm3 <-rbind(tpm.mat_maintain[,-2],as.matrix(tpm2))
rownames(tpm3) <-tpm3[,1]
tpm.mat <-t(apply(tpm3[,-1],1,as.numeric))
colnames(tpm.mat) <-colnames(tpm3)[-1]
dim(tpm.mat)
## [1] 17804 70
train_tpm <-log2(tpm.mat+0.001)
boxplot(train_tpm)
load(paste0(MEDIAsave,"datasetValidation_full.RData"),verbose = TRUE)
## Loading objects:
## newdataset
tmp<-newdataset
genes <- rownames(tmp)
geneAnn <-ens2gene19[ens2gene19$gene_name %in% genes,]
out <- as.data.frame(EDASeq::getGeneLengthAndGCContent(id = geneAnn$gene_id, org = 'hg19',mode = "org.db"))
out$gene_id <-rownames(out)
out <-merge(geneAnn,out,by="gene_id")
out <-out[complete.cases(out),]
out <-out[!duplicated(out$gene_name),]
rownames(out) <-out$gene_name
gene_length <-out[,3,drop=FALSE]
tmp <-tmp[rownames(gene_length),]
dim(tmp)
## [1] 20685 38
identical(rownames(tmp),rownames(out)) #TRUE
## [1] TRUE
y <- apply(tmp,2,function(x) x/as.numeric(gene_length[,1]))
tpm.mat2 <- t((t(y) * 1e6)/colSums(y))
head(colSums(tpm.mat2))
## Normal_Cart_10_8 Normal_Cart_2_2 Normal_Cart_3_3 Normal_Cart_4_4
## 1e+06 1e+06 1e+06 1e+06
## Normal_Cart_5_5 Normal_Cart_6_6
## 1e+06 1e+06
keep <- rowSums(tpm.mat2 > 0) >= 19 #TPMs > 0 in at least 50% of the samples
tpm.mat2 <-tpm.mat2[keep,]
validation_tpm <-log2(tpm.mat2+0.001)
boxplot(validation_tpm)
rm(y)
ii=intersect(rownames(train_tpm),rownames(validation_tpm)) #select common genes,14565
train_tpm <-train_tpm[ii,]
validation_tpm <-validation_tpm[ii,]
dim(train_tpm)
## [1] 14565 70
dim(validation_tpm)
## [1] 14565 38
train_tpm <-t(quantileNormalization(t(train_tpm)))
list_ranks <- as.numeric(sort(train_tpm[,1],decreasing=TRUE))
head(list_ranks)
## [1] 15.90471 14.80488 14.42682 14.01457 13.75508 13.53887
list_val <-vector("list",ncol(validation_tpm))
names(list_val) <-colnames(validation_tpm)
list_val <-apply(validation_tpm, 2, function(x) as.list(sort(x,decreasing = TRUE)))
list_valRank <-vector("list",ncol(validation_tpm))
names(list_valRank) <-colnames(validation_tpm)
for(i in 1:length(list_val)){
gg <-names(unlist(list_val[[i]]))
list_valRank[[i]] <-list_ranks
names(list_valRank[[i]]) <- gg
}
normdata2 <-matrix(NA,nrow = nrow(validation_tpm), ncol=ncol(validation_tpm))
rownames(normdata2) <-rownames(validation_tpm)
colnames(normdata2) <-colnames(validation_tpm)
for(i in 1:length(list_valRank)){
tmp <-as.data.frame(list_valRank[[i]])
rownames(tmp) <-names(list_valRank[[i]])
normdata2[,i] <-tmp[rownames(normdata2),]
}
boxplot(normdata2) #Validation re-scaled on quantile normalized reference
train_tpm_forRidge <- train_tpm
validation_tpm_forRidge <- normdata2
validation_tpm <-normdata2[rownames(normdata2) %in% c(up_genes$gene,dw_genes$gene),] #select consensus genes present
dim(validation_tpm)
## [1] 43 38
train_tpm <-train_tpm[rownames(train_tpm) %in% rownames(validation_tpm),]
dim(train_tpm)
## [1] 43 70
set.seed(7894)
runs <-vector("list",100)
names(runs) <-paste0("run_",1:100)
runs <-lapply(runs,function(x) x <- sample(11:70, 10, replace = FALSE))
subsets <-vector("list",100)
names(subsets) <-paste0("run_",1:100)
for(i in 1:length(subsets)) subsets[[i]] <-train_tpm[,c(1:10,as.numeric(runs[[i]]))] #create subsets
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
verboseIter = TRUE, returnData = FALSE,
savePredictions = TRUE,
classProbs = TRUE)
ClassBoot=as.factor(c(rep("N",10),rep("OA",10)))
list_tunes <-vector("list",100)
names(list_tunes) <-paste0("tunes_",1:100)
nCores <- 150 #change number of threads if needed
registerDoMC(nCores)
for(i in 1:length(subsets)){
train_dataset <-t(subsets[[i]])
train <- train(train_dataset,
y = ClassBoot,
method = "glmnet",
trControl = trainCtrl.rpcv,
metric = "Accuracy", allowParallel=TRUE,
tuneGrid = expand.grid(alpha = 0.5, #for elastic net, median between 0 and 1
lambda = seq(0.001,1,by = 0.001)))
list_tunes[[i]] <- coef(train$finalModel, train$finalModel$lambdaOpt)
}
table_occurrence <-matrix(0,100,length(rownames(train_tpm)))
colnames(table_occurrence) <-rownames(train_tpm)
rownames(table_occurrence) <-paste0("run_",1:100)
for(i in 1:nrow(table_occurrence)){
tmp=list_tunes[[i]][,1][-1]
table_occurrence[i,names(tmp)] <-tmp
}
meanvalues <-table_occurrence
table_occurrence[table_occurrence != 0] <-1
tbO <-melt(table_occurrence)
ggplot(tbO, aes(y=Var2, x=Var1, fill=value))+
geom_tile()+
theme(axis.text.y = element_text(size=12))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=8))+
xlab("Runs")+
ylab("Genes")+
scale_fill_continuous(high = "red", low = "blue")
(selected <-sort(colSums(table_occurrence)[colSums(table_occurrence) >= 50],decreasing = TRUE))
## TNFSF11 LOXL3 DNER TSPAN2 THBS3 ASPN HTRA1 DYSF
## 100 98 98 94 94 85 68 62
selected1 <-names(selected)
meanvalues <-meanvalues[,selected1]
(cf <-colMeans(meanvalues))
## TNFSF11 LOXL3 DNER TSPAN2 THBS3 ASPN HTRA1
## 0.14553148 0.08586524 0.17203360 0.04224252 0.10231979 0.03194829 0.02917054
## DYSF
## 0.03668216
barplot(sort(cf,decreasing = TRUE), col = "dodgerblue", cex.names=1.2,font = 2, yaxt = "n")
axis(side = 2)
ClassTRfull=as.factor(c(rep("N",10),rep("OA",60)))
score_df <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)
train_sc <-t(train_tpm[names(cf),])
sc <-rowSums(t(apply(train_sc,1,function(x) x*cf))) #compute score
rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc
score_df_train <-score_df
rm(score_df)
colnames(score_df_train)<-c("Patient","Score","Class")
ggplot(score_df_train, aes(x=Score,fill=Class)) +
geom_density(alpha=.7)+
xlab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
theme_bw()
ggplot(score_df_train, aes(y=Score,x=Class, fill=Class))+
geom_boxplot(alpha=.6)+
stat_compare_means(cex=7,label.x=0.75,label.y = 3)+
geom_jitter((aes(colour = Class))) +
ylab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
scale_color_manual(values = c("deeppink","turquoise"))+
theme_bw()+
theme(axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
legend.title=element_text(size=16),
legend.text=element_text(size=14))+
guides(color = guide_legend(override.aes = list(size = 2)))
col1 <-getGradientColors(score_df_train$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_train$Score)+ 0.01, minRange = min(score_df_train$Score)-0.01)
score_df_train$col1 <-col1
score_df_train <-score_df_train[order(score_df_train$Patient),]
score_df_train$col2 <-c(rep("plum",10),rep("cyan",60))
sum(up_genes$gene %in% rownames(train_tpm))
## [1] 38
sum(dw_genes$gene %in% rownames(train_tpm))
## [1] 5
sel <-c(up_genes$gene,dw_genes$gene)[c(up_genes$gene,dw_genes$gene) %in% rownames(train_tpm)] #43
train_tpm <-train_tpm[sel,]
annGenes <-data.frame(genes=sel, col3=c(rep("red",38),rep("blue",5)))
summ=summary(score_df_train$Score)
col_fun = colorRamp2(c(summ[1],summ[2],summ[5],summ[6]), c("#00BFFF","#197CDD","#0F4DB3","#000080"))
score_df_train <-score_df_train[order(score_df_train$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_train)
## Patient Score Class col1 col2
## MRI009 MRI009 1.340591 N #00BFFF plum
## MRI008 MRI008 1.443142 N #03B9FF plum
## MRI002 MRI002 1.489666 N #04B7FF plum
## MRI005 MRI005 1.532074 N #06B5FF plum
## MRI004 MRI004 1.672486 N #0AAEFF plum
## MRI006 MRI006 1.769805 N #0DAAFF plum
train_tpm_scal <-t(apply(train_tpm,1,scale))
rownames(train_tpm_scal) <-rownames(train_tpm)
colnames(train_tpm_scal) <-colnames(train_tpm)
train_tpm_scal[train_tpm_scal < -3] <- -3
train_tpm_scal[train_tpm_scal > 3] <- 3
heatmap3(train_tpm_scal[annGenes$genes,rownames(score_df_train)],
Colv=NA,
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(15,15),
ColSideColors = as.matrix(score_df_train[,c(4,5)]),
ColSideLabs = c("score R","Status"),
RowSideColors = annGenes$col3,
RowSideLabs = "Consensus",
cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))
ClassV=factor(c(rep("N",18),rep("OA",20)))
score_df <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)
val_sc <-t(validation_tpm[names(cf),])
identical(names(cf), colnames(val_sc)) #TRUE
## [1] TRUE
sc <-rowSums(t(apply(val_sc,1,function(x) x*cf)))
rownames(score_df) <-score_df[,1]
score_df <-score_df[names(sc),]
score_df$score <-sc
head(sc)
## Normal_Cart_10_8 Normal_Cart_2_2 Normal_Cart_3_3 Normal_Cart_4_4
## 2.665540 3.051013 2.073371 3.084911
## Normal_Cart_5_5 Normal_Cart_6_6
## 1.887718 2.122148
score_df_val <-score_df
rm(score_df)
colnames(score_df_val)<-c("Patient","Score","Class")
ggplot(score_df_val, aes(x=Score,fill=Class)) +
geom_density(alpha=.7)+
xlab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
theme_bw()
ggplot(score_df_val, aes(y=Score,x=Class, fill=Class))+
geom_boxplot(alpha=.6)+
stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
geom_jitter((aes(colour = Class))) +
ylab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
scale_color_manual(values = c("deeppink","turquoise"))+
theme_bw()+
theme(axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
legend.title=element_text(size=16),
legend.text=element_text(size=14))+
guides(color = guide_legend(override.aes = list(size = 2)))
col1 <-getGradientColors(score_df_val$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_val$Score)+ 0.01, minRange = min(score_df_val$Score)-0.01)
score_df_val$col1 <-col1
score_df_val <-score_df_val[order(score_df_val$Patient),]
score_df_val$col2 <-c(rep("plum",18),rep("cyan",20))
validation_tpm <-validation_tpm[sel,]
summ2=summary(score_df_val$Score)
col_fun2 = colorRamp2(c(summ2[1],summ2[2],summ2[5],summ2[6]), c("#00BFFF","#1C88F2","#0D40AB","#000080"))
score_df_val <-score_df_val[order(score_df_val$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_val)
## Patient Score Class col1 col2
## normal_06 normal_06 0.6873741 N #00BFFF plum
## normal_02 normal_02 1.1900648 N #0BACFF plum
## normal_04 normal_04 1.2291431 N #0CABFF plum
## normal_10 normal_10 1.4077748 N #10A5FF plum
## Normal_Cart_5_5 Normal_Cart_5_5 1.8877178 N #1B93FF plum
## normal_07 normal_07 1.9662685 N #1D91FF plum
validation_tpm_scal <-t(apply(validation_tpm,1,scale))
rownames(validation_tpm_scal) <-rownames(validation_tpm)
colnames(validation_tpm_scal) <-colnames(validation_tpm)
validation_tpm_scal[validation_tpm_scal < -3] <- -3
validation_tpm_scal[validation_tpm_scal > 3] <- 3
heatmap3(validation_tpm_scal[annGenes$genes,rownames(score_df_val)],
Colv=NA,
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(10,10),
ColSideColors = as.matrix(score_df_val[,c(4,5)]),
ColSideLabs = c("score R","Status"),
RowSideColors = annGenes$col3,
RowSideLabs = "Consensus",
cexRow = 1, cexCol = 1)
legend(0.4,1,legend=c("OA","Normal"),cex=2,
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun2, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))
score_df_val <-score_df_val[order(score_df_val$Class),]
rocobj <- roc(ClassV,score_df_val$Score)
auc <-round(rocobj$auc,3)
ggroc(rocobj, size = 2, col="dodgerblue",legacy.axes = TRUE) +
theme_bw()+
ggtitle(paste0('ROC Curve for score R on Validation dataset ', '(AUC = ', auc, ')'))+
labs(x = "1 - Specificity",y = "Sensitivity")
rocobj_val <-rocobj
validation_tpm <-validation_tpm_forRidge[rownames(validation_tpm_forRidge) %in% c(up_genes$gene,dw_genes$gene),]
train_tpm <-train_tpm_forRidge[rownames(train_tpm_forRidge) %in% rownames(validation_tpm),]
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
verboseIter = TRUE, returnData = FALSE,
savePredictions = TRUE,
classProbs = TRUE)
nCores <- 100 #change number if needed
registerDoMC(nCores)
train_dataset <-t(train_tpm)
train <- train(train_dataset,
y = ClassTRfull,
method = "glmnet",
trControl = trainCtrl.rpcv,
metric = "Accuracy", allowParallel=TRUE,
tuneGrid = expand.grid(alpha = 0, #ridge
lambda = seq(0.001,1,by = 0.001)))
cf_all <-coef(train$finalModel, train$finalModel$lambdaOpt)[,1][-1]
score_df_all_tr <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)
train_sc_all <-t(train_tpm[names(cf_all),])
sc1 <-rowSums(t(apply(train_sc_all,1,function(x) x*cf_all)))
rownames(score_df_all_tr) <-score_df_all_tr[,1]
score_df_all_tr <-score_df_all_tr[names(sc1),]
score_df_all_tr$score <-sc1
head(sc1)
## MRI001 MRI002 MRI003 MRI004 MRI005 MRI006
## 7.028785 2.452862 4.865943 3.555113 4.605602 3.928878
score_df_all_val <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)
val_sc_all <-t(validation_tpm[names(cf_all),])
sc <-rowSums(t(apply(val_sc_all,1,function(x) x*cf_all)))
rownames(score_df_all_val) <-score_df_all_val[,1]
score_df_all_val <-score_df_all_val[names(sc),]
score_df_all_val$score <-sc
head(sc)
## Normal_Cart_10_8 Normal_Cart_2_2 Normal_Cart_3_3 Normal_Cart_4_4
## 5.669915 7.437420 3.869674 6.863030
## Normal_Cart_5_5 Normal_Cart_6_6
## 5.256593 6.175890
score_df_all_val <-score_df_all_val[order(score_df_all_val$score, decreasing=FALSE),]
rocobj_val_all<- roc(score_df_all_val$class,score_df_all_val$score)
auc <-round(rocobj_val_all$auc,3)
ggroc(rocobj_val_all, size = 2, col="dodgerblue", legacy.axes = TRUE) +
theme_bw()+
ggtitle(paste0('ROC Curve for sT on Validation dataset ', '(AUC = ', auc, ')'))+
labs(x = "1 - Specificity",y = "Sensitivity")
## Compare the ROC curves on validation dataset to assess the efficacy
of the sR prediction and apply DeLong’s test
list_rocs <-list(rocobj_val,rocobj_val_all)
names(list_rocs) <-c("Score R","Score T")
auc <- lapply(list_rocs, function(x) round(x$auc,3))
auc[1]
## $`Score R`
## [1] 0.875
auc[2]
## $`Score T`
## [1] 0.922
(rocts <-roc.test(rocobj_val,rocobj_val_all))
##
## DeLong's test for two ROC curves
##
## data: rocobj_val and rocobj_val_all
## D = -0.66741, df = 69.259, p-value = 0.5067
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.8750000 0.9222222
ggroc(list_rocs, size = 2,legacy.axes = TRUE) +
theme_bw()+
annotate("text", 0.75, 0.44, label=expression("AUC 8 features - score s"["R"] : 0.875) ,size=7)+
annotate("text", 0.75,0.4, label=expression("AUC 43 features - score s"["T"] : 0.922) ,size=7)+
labs(x = "1 - Specificity",y = "Sensitivity",color='Models')+
theme(legend.title=element_text(size=16),legend.text=element_text(size=14))+
guides(Models = guide_legend(override.aes = list(size = 4)))
list_scores <-list(score_df_all_tr, score_df_all_val, score_df_train, score_df_val)
names(list_scores) <-c("scoreT Training","scoreT Valid","scoreR Training","scoreR Valid")
save(list_scores,file=paste0(MEDIAsave,"list_scores.RData"))